/* Copyright (C) 2000-2002 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <regex.h>

#include "udm_config.h"
#include "udm_common.h"
#include "udm_utils.h"
#include "udm_proto.h"
#include "udm_url.h"
#include "udm_hrefs.h"
#include "udm_server.h"
#include "udm_xmalloc.h"
#include "udm_host.h"
#include "udm_vars.h"

#define DEFAULT_PROXY_PORT	3128
#define ERRSTRSIZ 1000

/* Append a copy of "srv" to Conf->Servers.
 *
 * The URL is normalised into a local buffer first, so srv->url itself is
 * never modified:
 *   - substring-type entries with a non-empty URL are parsed with
 *     UdmURLParse(), get a trailing slash when only host info is present,
 *     and are then cut according to srv->Spider.follow;
 *   - regex-type entries are compiled with regcomp() (REG_ICASE unless
 *     UDM_SERVER_CS is set) and the compiled expression is stored in the
 *     new entry.
 * ExtraHeaders, Vars, alias, match_type and Spider settings are copied.
 *
 * Returns 0 on success. On error returns non-zero, sets Conf->errcode to 1
 * and puts a message into Conf->errstr; the server list is left unchanged.
 */
__INDLIB__ int UdmServerAdd(UDM_ENV *Conf,UDM_SERVER *srv){
	int		res;
	UDM_URL		from;
	regex_t		*pexpr=NULL;
	char		urlstr[UDM_URLSIZE];
	char		*url_copy;
	char		*alias_copy=NULL;
	char		*s;
	UDM_SERVER	*new;
	size_t		i;
	
	/* Work on a copy to keep srv->url unchanged. snprintf() always   */
	/* terminates the buffer, which strncpy() does not do when the    */
	/* source is longer than the destination.                         */
	snprintf(urlstr,sizeof(urlstr),"%s",UDM_NULL2EMPTY(srv->url));
	
	if((UDM_SRV_TYPE(srv->match_type)==UDM_SERVER_SUBSTR) && (urlstr[0])){
		
		/* Check whether valid URL is passed */
		if((res=UdmURLParse(&from,urlstr))){
			Conf->errcode=1;
			switch(res){
				case UDM_URL_LONG:
					snprintf(Conf->errstr,sizeof(Conf->errstr),"URL too long");
					break;
				case UDM_URL_BAD:
				default:
					snprintf(Conf->errstr,sizeof(Conf->errstr),"Badly formed URL");
					break;
			}
			return(1);
		}
		if((from.hostinfo[0])&&(!from.filename[0])){
			/* Add trailing slash                    */
			/* http://localhost -> http://localhost/ */
			snprintf(urlstr,sizeof(urlstr),"%s://%s%s",from.schema,from.hostinfo,from.path);
		}
		
		switch(srv->Spider.follow){
			case UDM_FOLLOW_PATH:
				/* Cut before '?' and after last '/' */
				if((s=strchr(urlstr,'?')))
					*s='\0';
				if((s=strrchr(urlstr,'/')))
					*(s+1)='\0';
				break;

			case UDM_FOLLOW_SITE:
				if(from.hostinfo[0]){
					/* Cut after hostinfo */
					snprintf(urlstr,sizeof(urlstr),"%s://%s/",from.schema,from.hostinfo);
				}else{
					/* Cut after first '/' */
					if((s=strchr(urlstr,'/')))
						*(s+1)='\0';
				}
				break;
			
			case UDM_FOLLOW_NO: 
			case UDM_FOLLOW_WORLD:
			default:
				break;
		}
	}else
	if(UDM_SRV_TYPE(srv->match_type)==UDM_SERVER_REGEX){
		int err;
		char regerrstr[ERRSTRSIZ]="";
		
		pexpr=UdmXmalloc(sizeof(regex_t));
		if(srv->match_type&UDM_SERVER_CS){
			err=regcomp(pexpr,urlstr,REG_EXTENDED);
		}else{
			err=regcomp(pexpr,urlstr,REG_EXTENDED|REG_ICASE);
		}
		if(err){
			regerror(err, pexpr, regerrstr, ERRSTRSIZ);
			free(pexpr);
			Conf->errcode=1;
			snprintf(Conf->errstr,sizeof(Conf->errstr),"Wrong regex in config file: %s: %s", urlstr,regerrstr);
			return(1);
		}
	}
	
	/* Duplicate the strings before touching the list, so that an   */
	/* allocation failure cannot leave a half-initialised entry.    */
	url_copy=strdup(urlstr);
	if(srv->alias)
		alias_copy=strdup(srv->alias);
	if((!url_copy)||((srv->alias)&&(!alias_copy))){
		free(url_copy);
		free(alias_copy);
		if(pexpr){
			regfree(pexpr);
			free(pexpr);
		}
		Conf->errcode=1;
		snprintf(Conf->errstr,sizeof(Conf->errstr),"Can't allocate memory for server entry");
		return(1);
	}
	
	/* Grow the array in steps of 16 entries */
	if(Conf->Servers.nservers>=Conf->Servers.mservers){
		Conf->Servers.mservers+=16;
		Conf->Servers.Server=(UDM_SERVER *)UdmXrealloc(Conf->Servers.Server,Conf->Servers.mservers*sizeof(UDM_SERVER));
	}
	new=&Conf->Servers.Server[Conf->Servers.nservers];
	UdmServerInit(new);
	
	for(i=0;i<srv->ExtraHeaders.nvars;i++){
		UdmVarListAdd(&new->ExtraHeaders,&srv->ExtraHeaders.Var[i]);
	}
	
	for(i=0;i<srv->Vars.nvars;i++){
		UdmVarListAdd(&new->Vars,&srv->Vars.Var[i]);
	}
	
	/* rec_id records the insertion order; cmpserver() uses it as a tie-breaker */
	new->rec_id=Conf->Servers.nservers;
	new->regexp=pexpr;
	new->match_type=srv->match_type;
	new->url=url_copy;
	new->alias=alias_copy;
	new->Spider=srv->Spider;
	
	Conf->Servers.nservers++;
	return(0);
}

/* Release the resources held by one server entry: its url and alias
 * (through UDM_FREE), the compiled regular expression if there is one,
 * and both variable lists. The UDM_SERVER structure itself is not freed.
 */
void UdmServerFree(UDM_SERVER *Server){
	UDM_FREE(Server->url);
	UDM_FREE(Server->alias);
	if(Server->regexp){
		regfree((regex_t*)(Server->regexp));
		free(Server->regexp);
		/* Do not leave a dangling pointer behind: a second call  */
		/* would otherwise regfree()/free() the same block again. */
		Server->regexp=NULL;
	}
	UdmVarListFree(&Server->Vars);
	UdmVarListFree(&Server->ExtraHeaders);
}

/* Free every entry of the list with UdmServerFree(), then reset both
 * counters to zero and release the entry array itself.
 */
void UdmServerListFree(UDM_SERVERLIST *List){
	size_t idx=0;
	
	while(idx<List->nservers){
		UdmServerFree(&List->Server[idx]);
		idx++;
	}
	
	List->mservers=0;
	List->nservers=0;
	UDM_FREE(List->Server);
}

/* This fuction finds Server entry for given URL         */
/* and return Alias in "aliastr" if it is not NULL       */
/* "aliastr" must be big enough to store result          */
/* not more than UDM_URLSTR bytes are written to aliastr */

UDM_SERVER * UdmServerFind(UDM_SERVERLIST *List,
			const char *url, const char *net, char * aliastr){
#define NS 10

	size_t i;
	char *robots=NULL;
	UDM_SERVER * Res=NULL;

	/* If it's a robot.txt, cut to hostinfo and find result */
	if((robots=strstr(url,"/robots.txt"))){
		if(!strcmp(robots,"/robots.txt")){
			robots=strdup(url);
			robots[strlen(url)-10]='\0';
		}else{
			robots=NULL;
		}
	}

	for(i=0;i<List->nservers;i++){
 		int res;
 		regmatch_t subs[NS];
 		UDM_SERVER *srv=&List->Server[i];
 		
		switch(UDM_SRV_TYPE(srv->match_type)){
    			case UDM_SERVER_REGEX:
				res=regexec(srv->regexp,url,NS,subs,0);
				if((!res)&&(srv->alias)&&(aliastr)){
					char *dst=aliastr;
					const char *src=srv->alias;
					
					while((*src)&&((dst-aliastr)<(UDM_URLSIZE-1))){
						if(*src=='$'){
							char digit[2];
							int sub;
							size_t len;
							
							digit[0]=src[1];
							digit[1]='\0';
							sub=atoi(digit);
							len=subs[sub].rm_eo-subs[sub].rm_so;
							strncpy(dst,url+subs[sub].rm_so,len);
#ifdef DEBUG_ALIAS
							fprintf(stderr,"Match %d-%d '%s'\n",(int)subs[sub].rm_so,(int)subs[sub].rm_eo,dst);
#endif
							dst+=len;*dst='\0';
							src+=2;
						}else{
							*dst=*src;
							dst++;*dst='\0';
							src++;
						}
					}
					*dst='\0';
#ifdef DEBUG_ALIAS							
					fprintf(stderr,"'%s' '%s' '%s' '%s'\n",url,Conf->Server[i].url,Conf->Server[i].alias,aliastr);
#endif
				}
					
				break;
			case UDM_SERVER_STRING:
				if(srv->match_type&UDM_SERVER_CS){
					res=UdmStrMatch(url,srv->url);
				}else{
					res=UdmStrCaseMatch(url,srv->url);
				}
				break;
			case UDM_SERVER_SUBNET:
				res=UdmStrMatch(net,srv->url);
				break;
			case UDM_SERVER_SUBSTR:
			default:
				if(robots){
					if(srv->match_type&UDM_SERVER_CS){
						res=UDM_STRNCMP(srv->url,robots);
					}else{
						res=UDM_STRNCASECMP(srv->url,robots);
					}
				}else{
					if(srv->Spider.follow==UDM_FOLLOW_NO){
						if(srv->match_type&UDM_SERVER_CS){
							res=strcmp(url,srv->url);
						}else{
							res=strcasecmp(url,srv->url);
						}
					}else{
						if(srv->match_type&UDM_SERVER_CS){
							res=UDM_STRNCMP(url,srv->url);
						}else{
							res=UDM_STRNCMP(url,srv->url);
						}
					}
				}
				if((aliastr)&&(!res)&&(srv->alias)&&((strlen(url)-strlen(srv->url)+strlen(srv->alias)))<UDM_URLSIZE){
					sprintf(aliastr,"%s%s",srv->alias,url+strlen(srv->url));
				}
				break;
		}
		if((!(srv->match_type&UDM_SERVER_MATCH))&&(res)){
			Res=srv;
			break;
		}else if((srv->match_type&UDM_SERVER_MATCH)&&(!res)){
			Res=srv;
			break;
		}
	}
	UDM_FREE(robots);
	return(Res);
}

static int cmpserver(const void *s1,const void *s2){
	int res;
	
	if(!(res=strlen(((const UDM_SERVER*)s2)->url)-strlen(((const UDM_SERVER*)s1)->url)))
		res=(((const UDM_SERVER*)s2)->rec_id)-(((const UDM_SERVER*)s1)->rec_id);
	return(res);
}
/* Sort the server list with cmpserver().
 * Longer names have to be found first, which allows different options
 * for a server and its subdirectories.
 */
void UdmServerListSort(UDM_SERVERLIST *List){
	const size_t entry_size=sizeof(List->Server[0]);
	
	qsort(List->Server,List->nservers,entry_size,cmpserver);
}
/* Reset a server entry to its defaults: every byte is zeroed, then the
 * match type and the spider parameters are set to the values below.
 * Nothing the entry may have pointed to is freed here.
 * Always returns 0.
 */
__INDLIB__ int UdmServerInit(UDM_SERVER * srv){
	/* memset() from <string.h> instead of the legacy bzero() */
	memset(srv,0,sizeof(*srv));

	srv->match_type=UDM_SERVER_SUBSTR|UDM_SERVER_MATCH;
	srv->Spider.period=UDM_DEFAULT_REINDEX_TIME;
	srv->Spider.max_net_errors=UDM_MAXNETERRORS;
	srv->Spider.read_timeout=UDM_READ_TIMEOUT;
	srv->Spider.doc_timeout=UDM_DOC_TIMEOUT;
	srv->Spider.maxhops=UDM_DEFAULT_MAX_HOPS;
	srv->Spider.index=1;
	srv->Spider.follow=UDM_FOLLOW_PATH;
	srv->Spider.use_robots=1;
	srv->Spider.use_clones=1;
	srv->Spider.net_error_delay_time=UDM_DEFAULT_NET_ERROR_DELAY_TIME;
	
	return(0);
}
